import pickle
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE
from scipy import stats
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
El objetivo de este proyecto consiste en lograr predecir correctamente 2 atributos que tiene un videojuego posteriormente a su lanzamiento, su número de ventas y las valoraciones de los usuarios. Se intentan predecir estos atributos ya que son importantes a la hora de decidir si conviene invertir en el juego, son, en el fondo, una buena medida del éxito del juego. Hay que precisar que se tiene que hacer una clasificación en 5 clases en las valoraciones de los usuarios y una regresión en el caso del número de ventas.
Los datos que proveen es un dataset con 7881 ejemplos que describen información respecto del videojuego, sus categorías, género, publicador, descripciones, etc. Son 14 atributos y 2 variables objetivos , la primera las ventas estimadas y la segunda 5 categorías del rating que le dieron los usuarios al juego.
La primera tarea se evalua en base a la métrica r2 ya que esta permite medir el coeficiente de correlación entre los valores predichos y los reales que tiene la ventaja de poder expresarse entre 0(correlación nula) y 1(correlación perfecta) y es bastante intuitivo.
La segunda tarea se evalúa en base a la métrica f1-score debido a que es una relación armónica entre precision y recall y por ende evita que se elijan clasificadores que maximicen solo una de estas métricas. Además f1-score es la métrica de facto para tareas de clasificación.
Nuestra propuesta para resolver el problema consistieron en modelo basado en RandomForest en el caso del clasificador, esto debido a la excelencia de RandomForest para clasificar en casos generales, además de su rapidez. En el caso del regresor se utilizó XGBoost ya que se quería probar con algo nuevo y debido a que en Internet se describía XGBoost como un modelo que ensamblaba varios y que no requería mucha optimización ni conocimiento específico del problema.
Nuestro modelo de clasificación cumplió las expectativas, haciendolo bastante bien. En cambio, nuestro modelo regresor no funcionó tan bien como habriamos esperado, teniendo resultados menores a los de varios participantes pero igual quedando por encima del Baseline del curso.
# Load the serialized train/test splits provided by the course.
# NOTE(review): pickle.load can execute arbitrary code; this is only acceptable
# because the files come from a trusted source (the course staff).
with open("train.pickle", "rb") as f:
    df_train = pickle.load(f)
with open("test.pickle", "rb") as f:
    df_test = pickle.load(f)
# Map the ordinal rating labels onto 0..4 so they can participate in
# correlations and numeric plots
# (Negative < Mixed < Mostly Positive < Positive < Very Positive).
df_train["rating_numeric"] = df_train["rating"].map({'Negative': 0., 'Mixed': 1., 'Mostly Positive': 2., 'Positive': 3., 'Very Positive': 4.}).astype(float)
def describe_dataset(df_train):
    """Display the frame, its numeric summary, and the count of duplicated game names."""
    display(df_train)
    display(df_train.describe())
    # A boolean Series sums to the number of True entries, i.e. the duplicates.
    dup_count = int(df_train.duplicated(subset=["name"]).sum())
    print(f'Duplicados: {dup_count}')


describe_dataset(df_train)
| name | release_date | english | developer | publisher | platforms | required_age | categories | genres | tags | achievements | average_playtime | price | short_description | estimated_sells | rating | rating_numeric | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | An Aspie Life | 2018-03-29 | 1 | Bradley Hennessey;Joe Watson | EnderLost Studios | windows | 0 | Single-player;Steam Achievements | Adventure;Casual;Free to Play;Indie;Simulation | Free to Play;Adventure;Indie | 23 | 0 | 0.00 | One day your roommate Leaves for no reason. Yo... | 3914 | Mixed | 1.0 |
| 1 | GhostControl Inc. | 2014-06-06 | 1 | bumblebee | Application Systems Heidelberg | windows;mac;linux | 0 | Single-player;Steam Achievements;Steam Trading... | Casual;Indie;Simulation;Strategy | Turn-Based;Indie;Simulation | 53 | 65 | 10.99 | Manage a team of ghosthunters and free London ... | 10728 | Mixed | 1.0 |
| 2 | Deponia | 2012-08-06 | 1 | Daedalic Entertainment | Daedalic Entertainment | windows;mac;linux | 0 | Single-player;Steam Achievements;Steam Trading... | Adventure;Indie | Adventure;Point & Click;Comedy | 19 | 217 | 6.99 | In Deponia, the world has degenerated into a v... | 635792 | Positive | 3.0 |
| 3 | Atlas Reactor | 2016-10-04 | 1 | Trion Worlds | Trion Worlds | windows | 0 | Multi-player;Online Multi-Player;Steam Achieve... | Free to Play;Strategy | Free to Play;Multiplayer;Strategy | 121 | 1240 | 0.00 | SEASON 6 NOW LIVE! The battle for Atlas contin... | 253864 | Positive | 3.0 |
| 4 | CHUCHEL | 2018-03-07 | 1 | Amanita Design | Amanita Design | windows;mac | 0 | Single-player;Steam Achievements;Steam Trading... | Adventure;Casual;Indie | Adventure;Indie;Casual | 7 | 245 | 7.99 | CHUCHEL is a comedy adventure game from the cr... | 49818 | Mostly Positive | 2.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7876 | KOEWOTAYORINI / 声之寄托 | 2018-03-26 | 0 | あみそ組 | Pujia8 Studio;Gamera Game | windows | 0 | Single-player;Steam Achievements;Steam Cloud | Adventure;Free to Play;Indie;RPG | Free to Play;Adventure;Anime | 20 | 65 | 0.00 | “喂喂,求求你啦!救救我!” ——你接到这样一个电话 发出这样的求救,给你打电话的人,是一名... | 24130 | Positive | 3.0 |
| 7877 | Montaro | 2016-07-25 | 1 | JCKSLAP | MBDL | windows | 0 | Single-player;Steam Achievements;Steam Trading... | Casual;Indie | Memes;Cute;Casual | 15 | 174 | 0.79 | Montaro is a DOGE. | 550368 | Very Positive | 4.0 |
| 7878 | Moe Jigsaw | 2018-03-23 | 1 | ARES Inc. | ARES Inc. | windows | 0 | Single-player;Steam Achievements;Steam Trading... | Casual;Indie | Casual;Nudity;Indie | 72 | 0 | 2.89 | "Moe Jigsaw" is the definitive versi... | 10906 | Mostly Positive | 2.0 |
| 7879 | Drunkn Bar Fight | 2016-11-28 | 1 | The Munky | The Munky | windows | 0 | Single-player;Multi-player;Online Multi-Player... | Action;Indie;Early Access | Early Access;Action;Indie | 0 | 0 | 10.99 | VR PARTY GAMEDrunkn Bar Fight is a simple, imm... | 18876 | Mostly Positive | 2.0 |
| 7880 | Intake | 2013-11-06 | 1 | Cipher Prime Studios | Cipher Prime Studios | windows;mac | 0 | Single-player;Steam Achievements;Steam Cloud;S... | Action;Indie | Indie;Action;Great Soundtrack | 77 | 75 | 6.99 | Intake is the new retro-futuristic drugstep ar... | 29625 | Very Positive | 4.0 |
7881 rows × 17 columns
| english | required_age | achievements | average_playtime | price | estimated_sells | rating_numeric | |
|---|---|---|---|---|---|---|---|
| count | 7881.000000 | 7881.00000 | 7881.000000 | 7881.000000 | 7881.000000 | 7.881000e+03 | 7881.000000 |
| mean | 0.985789 | 0.78924 | 43.170156 | 439.296790 | 8.431342 | 2.105767e+05 | 2.023982 |
| std | 0.118369 | 3.55538 | 265.399206 | 3303.162083 | 8.755668 | 1.513926e+06 | 1.315201 |
| min | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 3.600000e+03 | 0.000000 |
| 25% | 1.000000 | 0.00000 | 0.000000 | 0.000000 | 1.990000 | 9.724000e+03 | 1.000000 |
| 50% | 1.000000 | 0.00000 | 15.000000 | 27.000000 | 6.990000 | 2.150800e+04 | 2.000000 |
| 75% | 1.000000 | 0.00000 | 35.000000 | 251.000000 | 11.390000 | 7.357300e+04 | 3.000000 |
| max | 1.000000 | 18.00000 | 9821.000000 | 190625.000000 | 78.990000 | 7.944129e+07 | 4.000000 |
Duplicados: 0
Se puede notar que hay 7881 juegos, de los cuales la gran mayoría esta en ingles por lo que esta variable podría no ser muy relevante. Además que todos los juegos se pueden jugar desde los 18 y la mayoría de los juegos no necesita una edad para poder ser jugado. Se puede notar tambien que los precios en general no son muy caros, teniendo en cuenta que la media es de 8 dolares pero que hay algunos que se escapan de esto teniendo precios tan altos como 79 dolares. Respecto a la cantidad de ventas se puede ver que existe una alta desviación estandar lo que podría estar asociado a que a la mayoría de los juegos les va a ir mas o menos mediocre mientras que algunos juegos tendrán un alto éxito lo que puede dificultar esta clasificación.
Por último, respecto al rating se puede encontrar que en general las clases parecieran estar bastante balanceadas teniendo en cuenta los valores de los cuartiles.
También es relevante notar que no hay duplicados.
def show_correlations(df_train):
    """Display the numeric correlation matrix as a table and as a heatmap.

    Bug fix: the figure returned by ``px.imshow`` was discarded — inside a
    function the notebook does not auto-render it, so the heatmap was never
    actually shown. ``.show()`` makes it appear.
    """
    # numeric_only=True preserves the historic behavior (silently dropping
    # non-numeric columns), which newer pandas no longer does by default.
    corr = df_train.corr(numeric_only=True)
    display(corr)
    px.imshow(corr).show()


show_correlations(df_train)
| english | required_age | achievements | average_playtime | price | estimated_sells | rating_numeric | |
|---|---|---|---|---|---|---|---|
| english | 1.000000 | 0.015799 | 0.009748 | -0.007678 | 0.018056 | 0.014292 | -0.037753 |
| required_age | 0.015799 | 1.000000 | -0.001500 | 0.026692 | 0.157875 | 0.110789 | 0.041111 |
| achievements | 0.009748 | -0.001500 | 1.000000 | 0.000825 | -0.026374 | 0.022180 | -0.020295 |
| average_playtime | -0.007678 | 0.026692 | 0.000825 | 1.000000 | 0.045651 | 0.161612 | 0.031250 |
| price | 0.018056 | 0.157875 | -0.026374 | 0.045651 | 1.000000 | 0.062935 | 0.090508 |
| estimated_sells | 0.014292 | 0.110789 | 0.022180 | 0.161612 | 0.062935 | 1.000000 | 0.090395 |
| rating_numeric | -0.037753 | 0.041111 | -0.020295 | 0.031250 | 0.090508 | 0.090395 | 1.000000 |
Respecto al número de ventas, se puede notar que no hay una correlación muy notoria entre las variables númericas, pero si hay algunas variables que parecieran estar más vinculadas con la cantidad de ventas, por ejemplo la edad requerida (juegos con edades requeridas menores pueden ser menos atractivos) y tiempo de juego, a mayor tiempo de juego mayor debe ser el desarrollo del juego y por ende puede ser más atractivo.
Respecto al rating se puede notar que casi no hay ninguna correlación, pero las variables con una correlación positiva más importante son el precio y el número de ventas.
def histograms_categorical(df_train):
    """Histograms over developer/publisher: raw counts, then average
    estimated sells and average numeric rating per category."""
    # (x column, y column, aggregation) — None/None means a plain count.
    specs = [
        ("developer", None, None),
        ("publisher", None, None),
        ("developer", "estimated_sells", "avg"),
        ("publisher", "estimated_sells", "avg"),
        ("developer", "rating_numeric", "avg"),
        ("publisher", "rating_numeric", "avg"),
    ]
    for x_col, y_col, agg in specs:
        fig = px.histogram(df_train, x=x_col, y=y_col, histfunc=agg)
        fig.update_xaxes(categoryorder='total ascending')
        fig.show()


histograms_categorical(df_train)
Se puede notar que tanto quien publica como quien desarrolla el juego tienen una alta vinculación con la cantidad de ventas asociadas por la reputación que tienen, su publicidad, etc. Se puede observar que estos datos serán muy importantes para poder lograr buenos modelos. Para poder vectorizar tanto developer como publisher en la regresión podría ser importante tomar en cuenta cuantos juegos tiene cada uno de estos en vez de un onehotencode simple.
Se puede ver también que la relación con los ratings no es tan extrema como con las ventas, pero igualmente existe esta vinculación.
def histograms_numerical(df_train):
    """One histogram per numeric column of interest (both targets included)."""
    columns = (
        "estimated_sells",
        "rating_numeric",
        "required_age",
        "average_playtime",
        "price",
    )
    for column in columns:
        px.histogram(df_train, x=column).show()


histograms_numerical(df_train)
Podemos notar que historical sells tiene una gran cantidad de outliers al igual que average playtime y por ende deben eliminarse varios de estos outliers para que puedan servirle los datos a un clasificador. En cambio rating pareciera tener algo cercano a una distribución uniforme, con pequeñas diferencias que lo podrían acercar a una normal.
En cuanto a average playtime tambien existe una cantidad de outliers grande pero que tal vez se podrían agrupar, mientras que price tiene una distribución no representable por una normal pero sin muchos outliers.
El caso de required age es más extraño porque la mayoria de los juegos no tienen edad, sin embargo, los juegos que si tienen edad son muy relevantes en cuanto a número de ventas como se puede saber por las correlaciones.
def histograms_sells(df_train):
    """Average estimated sells grouped by age restriction, playtime and price."""
    for feature in ("required_age", "average_playtime", "price"):
        px.histogram(df_train, x=feature, y="estimated_sells", histfunc="avg").show()


histograms_sells(df_train)
Se puede notar una correlación clara entre las ventas y los juegos que son +18, incluso teniendo en cuenta que muy pocos juegos son +18, lo que es interesantisimo porque significa que se hacen muchos juegos sin limite de edad cuando los juegos con limite de edad tienen más ventas. Las segunda correlación del average playtime también es notoria pero menos que la anterior y por último la del precio no es muy distinguible.
def histograms_rating(df_train):
    """Average numeric rating against estimated sells and against price."""
    for feature in ("estimated_sells", "price"):
        px.histogram(df_train, x=feature, y="rating_numeric", histfunc="avg").show()


histograms_rating(df_train)
Es díficil notar alguna correlación muy directa, pero si podemos ver que a mayor precio mayor en general estan los promedios más altos.
def plot_tsne(X, df_train, label):
    """Scatter-plot a 2-D embedding, colored by the given dataframe column."""
    scatter = px.scatter(X, x=0, y=1, color=df_train[label], labels={'color': label})
    scatter.show()
def twod_proyections(df_train):
    """Project the numeric features to 2-D with t-SNE and color by each target.

    Bug fix: the original embedded *all* numeric columns, which includes the
    two targets (estimated_sells / rating_numeric) — so the projection
    partially encoded the very labels it was colored by, making the plots
    circular. The targets are now excluded from the embedded features.
    """
    features = df_train.select_dtypes(include=np.number).drop(
        columns=["estimated_sells", "rating_numeric"], errors="ignore"
    )
    X = TSNE(init="random", learning_rate="auto").fit_transform(features)
    plot_tsne(X, df_train, "estimated_sells")
    plot_tsne(X, df_train, "rating")


twod_proyections(df_train)
Al proyectar en 2D para el caso de las ventas estimadas no se puede encontrar ninguna relación ya que los outliers no nos dejan hacer diferenciación del color.
En cambio en el caso de los ratings, al proyectar en 2D encontramos que solo las columnas numéricas no nos permiten hacer la separación clara entre clusters de manera exitosa, ya que cada cluster contiene varias de las categorías sin una separación clara. Solo se puede determinar fácilmente un par de clusters que son mayoritariamente negative y otros que son mayoritariamente very positive.
def whole_exploration(df_train):
    """Run every exploration step, in order, on the given frame."""
    steps = (
        describe_dataset,
        show_correlations,
        histograms_categorical,
        histograms_numerical,
        histograms_sells,
        histograms_rating,
        twod_proyections,
    )
    for step in steps:
        step(df_train)
Para preparar nuestros datos, no eliminaremos duplicados debido a que no existen, y respecto a los outliers eliminaremos outliers solo para el caso de la cantidad de ventas ya que efectivamente hay juegos que tienen demasiadas ventas en comparación a otros y eliminar estos outliers nos da un número de ventas con una desviación estandar no tan alta.
# Keep only rows whose sells AND playtime z-scores are within 1 std of the mean.
# Bug fix: the original wrote np.abs(zscore(...) < 1) — the comparison happened
# *inside* np.abs, so abs() was applied to a boolean and the filter kept z < 1
# (letting any extreme low-side value through) instead of the intended |z| < 1.
# NOTE(review): the text above says only estimated_sells outliers are removed,
# but average_playtime is filtered here as well — confirm which was intended.
z = np.abs(stats.zscore(df_train.loc[:, ["estimated_sells", "average_playtime"]]))
df_clean = df_train[(z < 1).all(axis=1)]
print(f"Datos eliminados: {df_train.shape[0] - df_clean.shape[0]}")
Datos eliminados: 257
Después, hacemos la exploración de datos de nuevo teniendo en cuenta los outliers eliminados:
# Re-run the full exploration on the outlier-filtered frame.
whole_exploration(df_clean)
| name | release_date | english | developer | publisher | platforms | required_age | categories | genres | tags | achievements | average_playtime | price | short_description | estimated_sells | rating | rating_numeric | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | An Aspie Life | 2018-03-29 | 1 | Bradley Hennessey;Joe Watson | EnderLost Studios | windows | 0 | Single-player;Steam Achievements | Adventure;Casual;Free to Play;Indie;Simulation | Free to Play;Adventure;Indie | 23 | 0 | 0.00 | One day your roommate Leaves for no reason. Yo... | 3914 | Mixed | 1.0 |
| 1 | GhostControl Inc. | 2014-06-06 | 1 | bumblebee | Application Systems Heidelberg | windows;mac;linux | 0 | Single-player;Steam Achievements;Steam Trading... | Casual;Indie;Simulation;Strategy | Turn-Based;Indie;Simulation | 53 | 65 | 10.99 | Manage a team of ghosthunters and free London ... | 10728 | Mixed | 1.0 |
| 2 | Deponia | 2012-08-06 | 1 | Daedalic Entertainment | Daedalic Entertainment | windows;mac;linux | 0 | Single-player;Steam Achievements;Steam Trading... | Adventure;Indie | Adventure;Point & Click;Comedy | 19 | 217 | 6.99 | In Deponia, the world has degenerated into a v... | 635792 | Positive | 3.0 |
| 3 | Atlas Reactor | 2016-10-04 | 1 | Trion Worlds | Trion Worlds | windows | 0 | Multi-player;Online Multi-Player;Steam Achieve... | Free to Play;Strategy | Free to Play;Multiplayer;Strategy | 121 | 1240 | 0.00 | SEASON 6 NOW LIVE! The battle for Atlas contin... | 253864 | Positive | 3.0 |
| 4 | CHUCHEL | 2018-03-07 | 1 | Amanita Design | Amanita Design | windows;mac | 0 | Single-player;Steam Achievements;Steam Trading... | Adventure;Casual;Indie | Adventure;Indie;Casual | 7 | 245 | 7.99 | CHUCHEL is a comedy adventure game from the cr... | 49818 | Mostly Positive | 2.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7876 | KOEWOTAYORINI / 声之寄托 | 2018-03-26 | 0 | あみそ組 | Pujia8 Studio;Gamera Game | windows | 0 | Single-player;Steam Achievements;Steam Cloud | Adventure;Free to Play;Indie;RPG | Free to Play;Adventure;Anime | 20 | 65 | 0.00 | “喂喂,求求你啦!救救我!” ——你接到这样一个电话 发出这样的求救,给你打电话的人,是一名... | 24130 | Positive | 3.0 |
| 7877 | Montaro | 2016-07-25 | 1 | JCKSLAP | MBDL | windows | 0 | Single-player;Steam Achievements;Steam Trading... | Casual;Indie | Memes;Cute;Casual | 15 | 174 | 0.79 | Montaro is a DOGE. | 550368 | Very Positive | 4.0 |
| 7878 | Moe Jigsaw | 2018-03-23 | 1 | ARES Inc. | ARES Inc. | windows | 0 | Single-player;Steam Achievements;Steam Trading... | Casual;Indie | Casual;Nudity;Indie | 72 | 0 | 2.89 | "Moe Jigsaw" is the definitive versi... | 10906 | Mostly Positive | 2.0 |
| 7879 | Drunkn Bar Fight | 2016-11-28 | 1 | The Munky | The Munky | windows | 0 | Single-player;Multi-player;Online Multi-Player... | Action;Indie;Early Access | Early Access;Action;Indie | 0 | 0 | 10.99 | VR PARTY GAMEDrunkn Bar Fight is a simple, imm... | 18876 | Mostly Positive | 2.0 |
| 7880 | Intake | 2013-11-06 | 1 | Cipher Prime Studios | Cipher Prime Studios | windows;mac | 0 | Single-player;Steam Achievements;Steam Cloud;S... | Action;Indie | Indie;Action;Great Soundtrack | 77 | 75 | 6.99 | Intake is the new retro-futuristic drugstep ar... | 29625 | Very Positive | 4.0 |
7624 rows × 17 columns
| english | required_age | achievements | average_playtime | price | estimated_sells | rating_numeric | |
|---|---|---|---|---|---|---|---|
| count | 7624.000000 | 7624.000000 | 7624.000000 | 7624.000000 | 7624.000000 | 7.624000e+03 | 7624.000000 |
| mean | 0.985703 | 0.699895 | 42.283447 | 194.372770 | 8.270792 | 8.968644e+04 | 1.998426 |
| std | 0.118720 | 3.346910 | 268.334142 | 408.893233 | 8.532924 | 1.998071e+05 | 1.310108 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.600000e+03 | 0.000000 |
| 25% | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1.990000 | 9.503750e+03 | 1.000000 |
| 50% | 1.000000 | 0.000000 | 15.000000 | 18.000000 | 6.990000 | 2.024250e+04 | 2.000000 |
| 75% | 1.000000 | 0.000000 | 34.000000 | 232.000000 | 11.390000 | 6.485700e+04 | 3.000000 |
| max | 1.000000 | 18.000000 | 9821.000000 | 3736.000000 | 78.990000 | 1.709876e+06 | 4.000000 |
Duplicados: 0
| english | required_age | achievements | average_playtime | price | estimated_sells | rating_numeric | |
|---|---|---|---|---|---|---|---|
| english | 1.000000 | 0.013301 | 0.009920 | 0.006641 | 0.018885 | 0.037055 | -0.042316 |
| required_age | 0.013301 | 1.000000 | -0.007729 | 0.094450 | 0.143127 | 0.204879 | 0.026638 |
| achievements | 0.009920 | -0.007729 | 1.000000 | -0.000748 | -0.029537 | -0.000872 | -0.023986 |
| average_playtime | 0.006641 | 0.094450 | -0.000748 | 1.000000 | 0.228304 | 0.426791 | 0.080054 |
| price | 0.018885 | 0.143127 | -0.029537 | 0.228304 | 1.000000 | 0.186764 | 0.087657 |
| estimated_sells | 0.037055 | 0.204879 | -0.000872 | 0.426791 | 0.186764 | 1.000000 | 0.203393 |
| rating_numeric | -0.042316 | 0.026638 | -0.023986 | 0.080054 | 0.087657 | 0.203393 | 1.000000 |
Se puede notar que efectivamente mejora en cuantía la visualización en 2d de los clusters (sobre todo en la regresión) y que además ahora los histogramas tienen mas sentido.
Luego, preparamos un ColumnTransformer que prepara los datos:
class SeparatorTransformer(BaseEstimator, TransformerMixin):
    """Multi-hot encode a column whose cells hold separator-joined category lists.

    Example: a cell "Action;Indie" becomes a row with 1s in the Action and
    Indie positions and 0s elsewhere.

    Bug fix: the ``separator`` argument was previously ignored — ``__init__``
    hard-coded ";" (which also breaks sklearn's clone()/get_params contract of
    storing constructor args verbatim), and ``transform`` split on a literal
    ";" as well. Both now honor ``self.separator``.
    """

    def __init__(self, feature_name, separator=";"):
        self.feature_name = feature_name
        self.separator = separator

    def fit(self, X, y=None):
        # Build the universe of individual categories seen in training by
        # joining all distinct cell values and re-splitting on the separator.
        joined = self.separator.join(set(X[self.feature_name].values))
        self.all_categories = np.array(list(set(joined.split(self.separator))))
        self.ohe = OneHotEncoder(handle_unknown="ignore")
        self.ohe.fit(self.all_categories.reshape((-1, 1)))
        return self

    def transform(self, X, y=None):
        # Each row may list several categories; one-hot each split position
        # and sum the indicator rows into one multi-hot row per sample.
        splitted = X[self.feature_name].str.split(self.separator, expand=True)
        encoded = np.zeros((X.shape[0], len(self.all_categories)))
        for _, content in splitted.items():
            # Rows with fewer categories yield None in trailing columns;
            # handle_unknown="ignore" maps those to all-zero indicator rows.
            encoded += self.ohe.transform(content.to_numpy().reshape(-1, 1))
        return np.asarray(encoded)
# Columns whose cells contain ";"-separated category lists -> multi-hot encoded.
one_hot_encode_columns = ["developer", "publisher", "platforms", "categories", "genres", "tags"]
# Plain numeric columns -> standardized (zero mean, unit variance).
numeric_columns = ["achievements", "average_playtime", "price"]
transformers = [(f"ohe{i}", SeparatorTransformer(column), [column]) for i, column in enumerate(one_hot_encode_columns)]
transformers += [("num", StandardScaler(), numeric_columns)]
# Fix: use the built-in "passthrough" instead of FunctionTransformer(lambda ...).
# Behavior is identical (the column is forwarded unchanged), but the pipeline
# becomes picklable — lambdas cannot be pickled.
transformers += [('passthough', "passthrough", ['english'])]
# TfidfVectorizer expects a 1-D sequence of documents, hence the bare column
# name (no list) for the text column.
transformers += [("tfidf", TfidfVectorizer(), "short_description")]
preprocessor = ColumnTransformer(transformers=transformers)
Adjuntamos las transformaciones anteriores en un solo Pipeline y agregamos al final un clasificador sencillo ClasificadorX
# Full pipelines: the shared preprocessing step followed by each final model.
clf_classifier = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier()),
])
clf_regressor = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor()),
])
# Hold out 10% for evaluation; fixed random_state makes the split reproducible.
# The full frame is passed as X — the ColumnTransformer selects only the
# declared feature columns, so the target columns it still contains are unused.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(df_clean, df_clean["rating"], test_size=0.1, random_state=1)
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(df_clean, df_clean["estimated_sells"], test_size=0.1, random_state=1)
# Fit both pipelines (preprocessing + model) on their training splits.
clf_classifier.fit(X_train_c, y_train_c)
clf_regressor = clf_regressor.fit(X_train_r, y_train_r)
# Per-class precision/recall/f1 of the classifier on its held-out split.
y_pred_c = clf_classifier.predict(X_test_c)
print(classification_report(y_test_c, y_pred_c))
precision recall f1-score support
Mixed 0.29 0.26 0.27 162
Mostly Positive 0.20 0.20 0.20 136
Negative 0.50 0.31 0.38 132
Positive 0.35 0.62 0.44 211
Very Positive 0.78 0.17 0.28 122
accuracy 0.34 763
macro avg 0.42 0.31 0.32 763
weighted avg 0.40 0.34 0.33 763
# R^2 of the regressor on its held-out split (the declared competition metric).
y_pred_r = clf_regressor.predict(X_test_r)
print(r2_score(y_test_r, y_pred_r))
0.3592424872181571
# Predictions on the unlabeled competition test set, for submission.
y_pred_competition_r = clf_regressor.predict(df_test)
y_pred_competition_c = clf_classifier.predict(df_test)
Ahora, agregamos selección y/o reducción de atributos y probaremos una combinación de diferentes parámetros para mejorar nuestro modelo:
Nota: Procuren hacer Gridsearch sobre espacios de búsqueda razonables. Es decir, no estén 3 días buscando la mejor configuración...
Eliminaremos de las caracteristicas que enviamos el tfidif de las descripciones, el platform y si se juega en ingles o no porque no creemos que aporten información relevante
# Reduced feature set: the tfidf text features, platforms and english flag
# are dropped from the preprocessing.
one_hot_encode_columns = ["developer", "publisher", "categories", "genres", "tags"]
numeric_columns = ["achievements", "average_playtime", "price"]
transformers = []
for i, column in enumerate(one_hot_encode_columns):
    transformers.append((f"ohe{i}", SeparatorTransformer(column), [column]))
transformers.append(("num", StandardScaler(), numeric_columns))
preprocessor = ColumnTransformer(transformers=transformers)
# Rebuild both pipelines on top of the reduced preprocessor.
clf_classifier = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier()),
])
clf_regressor = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor()),
])
# Small, deliberate search space (per the course note about keeping grid
# searches reasonable).
param_grid_clf = {"classifier__n_estimators": [50, 100, 200], "classifier__max_depth": [None, 50, 100, 200]}
# Fix: select hyper-parameters with the competition metric (macro f1) rather
# than the default scorer (accuracy for classifiers), which was optimizing the
# wrong objective.
gs_clf = HalvingGridSearchCV(clf_classifier, param_grid_clf, scoring="f1_macro", verbose=10)
# NOTE(review): this fits on df_train (outliers included) while the earlier
# models were trained on df_clean — confirm which dataset was intended here.
gs_clf = gs_clf.fit(df_train, df_train["rating"])
n_iterations: 3 n_required_iterations: 3 n_possible_iterations: 3 min_resources_: 875 max_resources_: 7881 aggressive_elimination: False factor: 3 ---------- iter: 0 n_candidates: 12 n_resources: 875 Fitting 5 folds for each of 12 candidates, totalling 60 fits [CV 1/5; 1/12] START classifier__max_depth=None, classifier__n_estimators=50.... [CV 1/5; 1/12] END classifier__max_depth=None, classifier__n_estimators=50;, score=(train=1.000, test=0.331) total time= 0.3s [CV 2/5; 1/12] START classifier__max_depth=None, classifier__n_estimators=50.... [CV 2/5; 1/12] END classifier__max_depth=None, classifier__n_estimators=50;, score=(train=1.000, test=0.293) total time= 0.4s [CV 3/5; 1/12] START classifier__max_depth=None, classifier__n_estimators=50.... [CV 3/5; 1/12] END classifier__max_depth=None, classifier__n_estimators=50;, score=(train=1.000, test=0.287) total time= 0.4s [CV 4/5; 1/12] START classifier__max_depth=None, classifier__n_estimators=50.... [CV 4/5; 1/12] END classifier__max_depth=None, classifier__n_estimators=50;, score=(train=1.000, test=0.270) total time= 0.4s [CV 5/5; 1/12] START classifier__max_depth=None, classifier__n_estimators=50.... [CV 5/5; 1/12] END classifier__max_depth=None, classifier__n_estimators=50;, score=(train=1.000, test=0.236) total time= 0.3s [CV 1/5; 2/12] START classifier__max_depth=None, classifier__n_estimators=100... [CV 1/5; 2/12] END classifier__max_depth=None, classifier__n_estimators=100;, score=(train=1.000, test=0.360) total time= 0.5s [CV 2/5; 2/12] START classifier__max_depth=None, classifier__n_estimators=100... [CV 2/5; 2/12] END classifier__max_depth=None, classifier__n_estimators=100;, score=(train=1.000, test=0.287) total time= 0.5s [CV 3/5; 2/12] START classifier__max_depth=None, classifier__n_estimators=100... 
[CV 3/5; 2/12] END classifier__max_depth=None, classifier__n_estimators=100;, score=(train=1.000, test=0.293) total time= 0.5s [CV 4/5; 2/12] START classifier__max_depth=None, classifier__n_estimators=100... [CV 4/5; 2/12] END classifier__max_depth=None, classifier__n_estimators=100;, score=(train=1.000, test=0.259) total time= 0.5s [CV 5/5; 2/12] START classifier__max_depth=None, classifier__n_estimators=100... [CV 5/5; 2/12] END classifier__max_depth=None, classifier__n_estimators=100;, score=(train=1.000, test=0.259) total time= 0.5s [CV 1/5; 3/12] START classifier__max_depth=None, classifier__n_estimators=200... [CV 1/5; 3/12] END classifier__max_depth=None, classifier__n_estimators=200;, score=(train=1.000, test=0.320) total time= 0.9s [CV 2/5; 3/12] START classifier__max_depth=None, classifier__n_estimators=200... [CV 2/5; 3/12] END classifier__max_depth=None, classifier__n_estimators=200;, score=(train=1.000, test=0.339) total time= 1.0s [CV 3/5; 3/12] START classifier__max_depth=None, classifier__n_estimators=200... [CV 3/5; 3/12] END classifier__max_depth=None, classifier__n_estimators=200;, score=(train=1.000, test=0.328) total time= 0.9s [CV 4/5; 3/12] START classifier__max_depth=None, classifier__n_estimators=200... [CV 4/5; 3/12] END classifier__max_depth=None, classifier__n_estimators=200;, score=(train=1.000, test=0.270) total time= 1.0s [CV 5/5; 3/12] START classifier__max_depth=None, classifier__n_estimators=200... [CV 5/5; 3/12] END classifier__max_depth=None, classifier__n_estimators=200;, score=(train=1.000, test=0.299) total time= 0.9s [CV 1/5; 4/12] START classifier__max_depth=50, classifier__n_estimators=50...... [CV 1/5; 4/12] END classifier__max_depth=50, classifier__n_estimators=50;, score=(train=1.000, test=0.331) total time= 0.3s [CV 2/5; 4/12] START classifier__max_depth=50, classifier__n_estimators=50...... 
[CV 2/5; 4/12] END classifier__max_depth=50, classifier__n_estimators=50;, score=(train=1.000, test=0.299) total time= 0.3s [CV 3/5; 4/12] START classifier__max_depth=50, classifier__n_estimators=50...... [CV 3/5; 4/12] END classifier__max_depth=50, classifier__n_estimators=50;, score=(train=1.000, test=0.310) total time= 0.3s [CV 4/5; 4/12] START classifier__max_depth=50, classifier__n_estimators=50...... [CV 4/5; 4/12] END classifier__max_depth=50, classifier__n_estimators=50;, score=(train=1.000, test=0.287) total time= 0.4s [CV 5/5; 4/12] START classifier__max_depth=50, classifier__n_estimators=50...... [CV 5/5; 4/12] END classifier__max_depth=50, classifier__n_estimators=50;, score=(train=1.000, test=0.287) total time= 0.4s [CV 1/5; 5/12] START classifier__max_depth=50, classifier__n_estimators=100..... [CV 1/5; 5/12] END classifier__max_depth=50, classifier__n_estimators=100;, score=(train=1.000, test=0.303) total time= 0.5s [CV 2/5; 5/12] START classifier__max_depth=50, classifier__n_estimators=100..... [CV 2/5; 5/12] END classifier__max_depth=50, classifier__n_estimators=100;, score=(train=1.000, test=0.322) total time= 0.6s [CV 3/5; 5/12] START classifier__max_depth=50, classifier__n_estimators=100..... [CV 3/5; 5/12] END classifier__max_depth=50, classifier__n_estimators=100;, score=(train=1.000, test=0.368) total time= 0.6s [CV 4/5; 5/12] START classifier__max_depth=50, classifier__n_estimators=100..... [CV 4/5; 5/12] END classifier__max_depth=50, classifier__n_estimators=100;, score=(train=1.000, test=0.316) total time= 0.6s [CV 5/5; 5/12] START classifier__max_depth=50, classifier__n_estimators=100..... [CV 5/5; 5/12] END classifier__max_depth=50, classifier__n_estimators=100;, score=(train=1.000, test=0.368) total time= 0.6s [CV 1/5; 6/12] START classifier__max_depth=50, classifier__n_estimators=200..... 
[CV 1/5; 6/12] END classifier__max_depth=50, classifier__n_estimators=200;, score=(train=1.000, test=0.309) total time= 1.0s [CV 2/5; 6/12] START classifier__max_depth=50, classifier__n_estimators=200..... [CV 2/5; 6/12] END classifier__max_depth=50, classifier__n_estimators=200;, score=(train=1.000, test=0.351) total time= 0.9s [CV 3/5; 6/12] START classifier__max_depth=50, classifier__n_estimators=200..... [CV 3/5; 6/12] END classifier__max_depth=50, classifier__n_estimators=200;, score=(train=1.000, test=0.333) total time= 0.9s [CV 4/5; 6/12] START classifier__max_depth=50, classifier__n_estimators=200..... [CV 4/5; 6/12] END classifier__max_depth=50, classifier__n_estimators=200;, score=(train=1.000, test=0.270) total time= 0.9s [CV 5/5; 6/12] START classifier__max_depth=50, classifier__n_estimators=200..... [CV 5/5; 6/12] END classifier__max_depth=50, classifier__n_estimators=200;, score=(train=1.000, test=0.328) total time= 0.8s [CV 1/5; 7/12] START classifier__max_depth=100, classifier__n_estimators=50..... [CV 1/5; 7/12] END classifier__max_depth=100, classifier__n_estimators=50;, score=(train=1.000, test=0.269) total time= 0.3s [CV 2/5; 7/12] START classifier__max_depth=100, classifier__n_estimators=50..... [CV 2/5; 7/12] END classifier__max_depth=100, classifier__n_estimators=50;, score=(train=1.000, test=0.316) total time= 0.3s [CV 3/5; 7/12] START classifier__max_depth=100, classifier__n_estimators=50..... [CV 3/5; 7/12] END classifier__max_depth=100, classifier__n_estimators=50;, score=(train=1.000, test=0.305) total time= 0.3s [CV 4/5; 7/12] START classifier__max_depth=100, classifier__n_estimators=50..... [CV 4/5; 7/12] END classifier__max_depth=100, classifier__n_estimators=50;, score=(train=1.000, test=0.236) total time= 0.4s [CV 5/5; 7/12] START classifier__max_depth=100, classifier__n_estimators=50..... 
[CV 5/5; 7/12] END classifier__max_depth=100, classifier__n_estimators=50;, score=(train=1.000, test=0.328) total time= 0.3s [CV 1/5; 8/12] START classifier__max_depth=100, classifier__n_estimators=100.... [CV 1/5; 8/12] END classifier__max_depth=100, classifier__n_estimators=100;, score=(train=1.000, test=0.309) total time= 0.5s [CV 2/5; 8/12] START classifier__max_depth=100, classifier__n_estimators=100.... [CV 2/5; 8/12] END classifier__max_depth=100, classifier__n_estimators=100;, score=(train=1.000, test=0.339) total time= 0.5s [CV 3/5; 8/12] START classifier__max_depth=100, classifier__n_estimators=100.... [CV 3/5; 8/12] END classifier__max_depth=100, classifier__n_estimators=100;, score=(train=1.000, test=0.305) total time= 0.5s [CV 4/5; 8/12] START classifier__max_depth=100, classifier__n_estimators=100.... [CV 4/5; 8/12] END classifier__max_depth=100, classifier__n_estimators=100;, score=(train=1.000, test=0.299) total time= 0.5s [CV 5/5; 8/12] START classifier__max_depth=100, classifier__n_estimators=100.... [CV 5/5; 8/12] END classifier__max_depth=100, classifier__n_estimators=100;, score=(train=1.000, test=0.287) total time= 0.5s [CV 1/5; 9/12] START classifier__max_depth=100, classifier__n_estimators=200.... [CV 1/5; 9/12] END classifier__max_depth=100, classifier__n_estimators=200;, score=(train=1.000, test=0.320) total time= 0.9s [CV 2/5; 9/12] START classifier__max_depth=100, classifier__n_estimators=200.... [CV 2/5; 9/12] END classifier__max_depth=100, classifier__n_estimators=200;, score=(train=1.000, test=0.339) total time= 0.9s [CV 3/5; 9/12] START classifier__max_depth=100, classifier__n_estimators=200.... [CV 3/5; 9/12] END classifier__max_depth=100, classifier__n_estimators=200;, score=(train=1.000, test=0.345) total time= 1.0s [CV 4/5; 9/12] START classifier__max_depth=100, classifier__n_estimators=200.... 
[CV 4/5; 9/12] END classifier__max_depth=100, classifier__n_estimators=200;, score=(train=1.000, test=0.282) total time= 0.9s [CV 5/5; 9/12] START classifier__max_depth=100, classifier__n_estimators=200.... [CV 5/5; 9/12] END classifier__max_depth=100, classifier__n_estimators=200;, score=(train=1.000, test=0.305) total time= 0.9s [CV 1/5; 10/12] START classifier__max_depth=200, classifier__n_estimators=50.... [CV 1/5; 10/12] END classifier__max_depth=200, classifier__n_estimators=50;, score=(train=1.000, test=0.314) total time= 0.3s [CV 2/5; 10/12] START classifier__max_depth=200, classifier__n_estimators=50.... [CV 2/5; 10/12] END classifier__max_depth=200, classifier__n_estimators=50;, score=(train=1.000, test=0.333) total time= 0.3s [CV 3/5; 10/12] START classifier__max_depth=200, classifier__n_estimators=50.... [CV 3/5; 10/12] END classifier__max_depth=200, classifier__n_estimators=50;, score=(train=1.000, test=0.293) total time= 0.3s [CV 4/5; 10/12] START classifier__max_depth=200, classifier__n_estimators=50.... [CV 4/5; 10/12] END classifier__max_depth=200, classifier__n_estimators=50;, score=(train=1.000, test=0.276) total time= 0.3s [CV 5/5; 10/12] START classifier__max_depth=200, classifier__n_estimators=50.... [CV 5/5; 10/12] END classifier__max_depth=200, classifier__n_estimators=50;, score=(train=1.000, test=0.270) total time= 0.3s [CV 1/5; 11/12] START classifier__max_depth=200, classifier__n_estimators=100... [CV 1/5; 11/12] END classifier__max_depth=200, classifier__n_estimators=100;, score=(train=1.000, test=0.371) total time= 0.5s [CV 2/5; 11/12] START classifier__max_depth=200, classifier__n_estimators=100... [CV 2/5; 11/12] END classifier__max_depth=200, classifier__n_estimators=100;, score=(train=1.000, test=0.345) total time= 0.6s [CV 3/5; 11/12] START classifier__max_depth=200, classifier__n_estimators=100... 
[CV 3/5; 11/12] END classifier__max_depth=200, classifier__n_estimators=100;, score=(train=1.000, test=0.310) total time= 0.5s [CV 4/5; 11/12] START classifier__max_depth=200, classifier__n_estimators=100... [CV 4/5; 11/12] END classifier__max_depth=200, classifier__n_estimators=100;, score=(train=1.000, test=0.310) total time= 0.6s [CV 5/5; 11/12] START classifier__max_depth=200, classifier__n_estimators=100... [CV 5/5; 11/12] END classifier__max_depth=200, classifier__n_estimators=100;, score=(train=1.000, test=0.305) total time= 0.6s [CV 1/5; 12/12] START classifier__max_depth=200, classifier__n_estimators=200... [CV 1/5; 12/12] END classifier__max_depth=200, classifier__n_estimators=200;, score=(train=1.000, test=0.331) total time= 0.9s [CV 2/5; 12/12] START classifier__max_depth=200, classifier__n_estimators=200... [CV 2/5; 12/12] END classifier__max_depth=200, classifier__n_estimators=200;, score=(train=1.000, test=0.351) total time= 1.0s [CV 3/5; 12/12] START classifier__max_depth=200, classifier__n_estimators=200... [CV 3/5; 12/12] END classifier__max_depth=200, classifier__n_estimators=200;, score=(train=1.000, test=0.316) total time= 0.8s [CV 4/5; 12/12] START classifier__max_depth=200, classifier__n_estimators=200... [CV 4/5; 12/12] END classifier__max_depth=200, classifier__n_estimators=200;, score=(train=1.000, test=0.310) total time= 0.9s [CV 5/5; 12/12] START classifier__max_depth=200, classifier__n_estimators=200... [CV 5/5; 12/12] END classifier__max_depth=200, classifier__n_estimators=200;, score=(train=1.000, test=0.305) total time= 0.9s ---------- iter: 1 n_candidates: 4 n_resources: 2625 Fitting 5 folds for each of 4 candidates, totalling 20 fits [CV 1/5; 1/4] START classifier__max_depth=50, classifier__n_estimators=200...... [CV 1/5; 1/4] END classifier__max_depth=50, classifier__n_estimators=200;, score=(train=1.000, test=0.307) total time= 4.4s [CV 2/5; 1/4] START classifier__max_depth=50, classifier__n_estimators=200...... 
[CV 2/5; 1/4] END classifier__max_depth=50, classifier__n_estimators=200;, score=(train=1.000, test=0.319) total time= 4.2s [CV 3/5; 1/4] START classifier__max_depth=50, classifier__n_estimators=200...... [CV 3/5; 1/4] END classifier__max_depth=50, classifier__n_estimators=200;, score=(train=1.000, test=0.288) total time= 4.2s [CV 4/5; 1/4] START classifier__max_depth=50, classifier__n_estimators=200...... [CV 4/5; 1/4] END classifier__max_depth=50, classifier__n_estimators=200;, score=(train=1.000, test=0.326) total time= 4.3s [CV 5/5; 1/4] START classifier__max_depth=50, classifier__n_estimators=200...... [CV 5/5; 1/4] END classifier__max_depth=50, classifier__n_estimators=200;, score=(train=1.000, test=0.277) total time= 4.2s [CV 1/5; 2/4] START classifier__max_depth=200, classifier__n_estimators=200..... [CV 1/5; 2/4] END classifier__max_depth=200, classifier__n_estimators=200;, score=(train=1.000, test=0.314) total time= 5.0s [CV 2/5; 2/4] START classifier__max_depth=200, classifier__n_estimators=200..... [CV 2/5; 2/4] END classifier__max_depth=200, classifier__n_estimators=200;, score=(train=1.000, test=0.345) total time= 4.8s [CV 3/5; 2/4] START classifier__max_depth=200, classifier__n_estimators=200..... [CV 3/5; 2/4] END classifier__max_depth=200, classifier__n_estimators=200;, score=(train=1.000, test=0.281) total time= 4.7s [CV 4/5; 2/4] START classifier__max_depth=200, classifier__n_estimators=200..... [CV 4/5; 2/4] END classifier__max_depth=200, classifier__n_estimators=200;, score=(train=1.000, test=0.305) total time= 4.9s [CV 5/5; 2/4] START classifier__max_depth=200, classifier__n_estimators=200..... [CV 5/5; 2/4] END classifier__max_depth=200, classifier__n_estimators=200;, score=(train=1.000, test=0.303) total time= 4.9s [CV 1/5; 3/4] START classifier__max_depth=200, classifier__n_estimators=100..... 
[CV 1/5; 3/4] END classifier__max_depth=200, classifier__n_estimators=100;, score=(train=1.000, test=0.291) total time= 2.6s [CV 2/5; 3/4] START classifier__max_depth=200, classifier__n_estimators=100..... [CV 2/5; 3/4] END classifier__max_depth=200, classifier__n_estimators=100;, score=(train=1.000, test=0.334) total time= 2.6s [CV 3/5; 3/4] START classifier__max_depth=200, classifier__n_estimators=100..... [CV 3/5; 3/4] END classifier__max_depth=200, classifier__n_estimators=100;, score=(train=1.000, test=0.275) total time= 2.5s [CV 4/5; 3/4] START classifier__max_depth=200, classifier__n_estimators=100..... [CV 4/5; 3/4] END classifier__max_depth=200, classifier__n_estimators=100;, score=(train=1.000, test=0.330) total time= 2.7s [CV 5/5; 3/4] START classifier__max_depth=200, classifier__n_estimators=100..... [CV 5/5; 3/4] END classifier__max_depth=200, classifier__n_estimators=100;, score=(train=1.000, test=0.321) total time= 2.6s [CV 1/5; 4/4] START classifier__max_depth=50, classifier__n_estimators=100...... [CV 1/5; 4/4] END classifier__max_depth=50, classifier__n_estimators=100;, score=(train=0.999, test=0.312) total time= 2.3s [CV 2/5; 4/4] START classifier__max_depth=50, classifier__n_estimators=100...... [CV 2/5; 4/4] END classifier__max_depth=50, classifier__n_estimators=100;, score=(train=0.999, test=0.296) total time= 2.4s [CV 3/5; 4/4] START classifier__max_depth=50, classifier__n_estimators=100...... [CV 3/5; 4/4] END classifier__max_depth=50, classifier__n_estimators=100;, score=(train=1.000, test=0.315) total time= 2.2s [CV 4/5; 4/4] START classifier__max_depth=50, classifier__n_estimators=100...... [CV 4/5; 4/4] END classifier__max_depth=50, classifier__n_estimators=100;, score=(train=1.000, test=0.309) total time= 2.2s [CV 5/5; 4/4] START classifier__max_depth=50, classifier__n_estimators=100...... 
[CV 5/5; 4/4] END classifier__max_depth=50, classifier__n_estimators=100;, score=(train=0.999, test=0.277) total time= 2.3s ---------- iter: 2 n_candidates: 2 n_resources: 7875 Fitting 5 folds for each of 2 candidates, totalling 10 fits [CV 1/5; 1/2] START classifier__max_depth=200, classifier__n_estimators=200..... [CV 1/5; 1/2] END classifier__max_depth=200, classifier__n_estimators=200;, score=(train=0.999, test=0.356) total time= 36.0s [CV 2/5; 1/2] START classifier__max_depth=200, classifier__n_estimators=200..... [CV 2/5; 1/2] END classifier__max_depth=200, classifier__n_estimators=200;, score=(train=0.999, test=0.358) total time= 35.0s [CV 3/5; 1/2] START classifier__max_depth=200, classifier__n_estimators=200..... [CV 3/5; 1/2] END classifier__max_depth=200, classifier__n_estimators=200;, score=(train=0.999, test=0.360) total time= 33.8s [CV 4/5; 1/2] START classifier__max_depth=200, classifier__n_estimators=200..... [CV 4/5; 1/2] END classifier__max_depth=200, classifier__n_estimators=200;, score=(train=0.999, test=0.356) total time= 35.8s [CV 5/5; 1/2] START classifier__max_depth=200, classifier__n_estimators=200..... [CV 5/5; 1/2] END classifier__max_depth=200, classifier__n_estimators=200;, score=(train=0.999, test=0.341) total time= 34.7s [CV 1/5; 2/2] START classifier__max_depth=200, classifier__n_estimators=100..... [CV 1/5; 2/2] END classifier__max_depth=200, classifier__n_estimators=100;, score=(train=0.999, test=0.334) total time= 20.3s [CV 2/5; 2/2] START classifier__max_depth=200, classifier__n_estimators=100..... [CV 2/5; 2/2] END classifier__max_depth=200, classifier__n_estimators=100;, score=(train=0.999, test=0.336) total time= 19.2s [CV 3/5; 2/2] START classifier__max_depth=200, classifier__n_estimators=100..... [CV 3/5; 2/2] END classifier__max_depth=200, classifier__n_estimators=100;, score=(train=0.999, test=0.357) total time= 20.1s [CV 4/5; 2/2] START classifier__max_depth=200, classifier__n_estimators=100..... 
[CV 4/5; 2/2] END classifier__max_depth=200, classifier__n_estimators=100;, score=(train=0.999, test=0.341) total time= 20.1s [CV 5/5; 2/2] START classifier__max_depth=200, classifier__n_estimators=100..... [CV 5/5; 2/2] END classifier__max_depth=200, classifier__n_estimators=100;, score=(train=0.999, test=0.333) total time= 26.6s
# Hyperparameter grid for the XGBoost regressor step of the pipeline
# (keys follow sklearn's "step__param" convention).
param_grid_reg = {'regressor__max_depth': [3,4,5], 'regressor__n_estimators': [50,100,200]}
# Successive-halving search: starts each candidate on a small sample of the
# data and promotes only the best ones to larger budgets (see log below).
gs_reg = HalvingGridSearchCV(clf_regressor, param_grid_reg, verbose=10)
# NOTE(review): the whole df_train is passed as X while the target
# "estimated_sells" is extracted from the same frame — presumably the
# pipeline's preprocessing drops the target column(s) before fitting;
# verify, otherwise this leaks the target into the features.
gs_reg = gs_reg.fit(df_train, df_train["estimated_sells"])
n_iterations: 3 n_required_iterations: 3 n_possible_iterations: 3 min_resources_: 875 max_resources_: 7881 aggressive_elimination: False factor: 3 ---------- iter: 0 n_candidates: 9 n_resources: 875 Fitting 5 folds for each of 9 candidates, totalling 45 fits [CV 1/5; 1/9] START regressor__max_depth=3, regressor__n_estimators=50.......... [CV 1/5; 1/9] END regressor__max_depth=3, regressor__n_estimators=50;, score=(train=0.996, test=0.322) total time= 0.8s [CV 2/5; 1/9] START regressor__max_depth=3, regressor__n_estimators=50.......... [CV 2/5; 1/9] END regressor__max_depth=3, regressor__n_estimators=50;, score=(train=0.991, test=-4.001) total time= 0.7s [CV 3/5; 1/9] START regressor__max_depth=3, regressor__n_estimators=50.......... [CV 3/5; 1/9] END regressor__max_depth=3, regressor__n_estimators=50;, score=(train=0.996, test=0.418) total time= 0.8s [CV 4/5; 1/9] START regressor__max_depth=3, regressor__n_estimators=50.......... [CV 4/5; 1/9] END regressor__max_depth=3, regressor__n_estimators=50;, score=(train=0.977, test=0.381) total time= 0.8s [CV 5/5; 1/9] START regressor__max_depth=3, regressor__n_estimators=50.......... [CV 5/5; 1/9] END regressor__max_depth=3, regressor__n_estimators=50;, score=(train=0.937, test=0.021) total time= 0.8s [CV 1/5; 2/9] START regressor__max_depth=3, regressor__n_estimators=100......... [CV 1/5; 2/9] END regressor__max_depth=3, regressor__n_estimators=100;, score=(train=0.998, test=0.326) total time= 1.5s [CV 2/5; 2/9] START regressor__max_depth=3, regressor__n_estimators=100......... [CV 2/5; 2/9] END regressor__max_depth=3, regressor__n_estimators=100;, score=(train=0.997, test=-4.084) total time= 4.0s [CV 3/5; 2/9] START regressor__max_depth=3, regressor__n_estimators=100......... [CV 3/5; 2/9] END regressor__max_depth=3, regressor__n_estimators=100;, score=(train=0.998, test=0.417) total time= 1.5s [CV 4/5; 2/9] START regressor__max_depth=3, regressor__n_estimators=100......... 
[CV 4/5; 2/9] END regressor__max_depth=3, regressor__n_estimators=100;, score=(train=0.991, test=0.380) total time= 1.4s [CV 5/5; 2/9] START regressor__max_depth=3, regressor__n_estimators=100......... [CV 5/5; 2/9] END regressor__max_depth=3, regressor__n_estimators=100;, score=(train=0.971, test=0.019) total time= 1.5s [CV 1/5; 3/9] START regressor__max_depth=3, regressor__n_estimators=200......... [CV 1/5; 3/9] END regressor__max_depth=3, regressor__n_estimators=200;, score=(train=1.000, test=0.327) total time= 3.9s [CV 2/5; 3/9] START regressor__max_depth=3, regressor__n_estimators=200......... [CV 2/5; 3/9] END regressor__max_depth=3, regressor__n_estimators=200;, score=(train=0.999, test=-4.109) total time= 3.2s [CV 3/5; 3/9] START regressor__max_depth=3, regressor__n_estimators=200......... [CV 3/5; 3/9] END regressor__max_depth=3, regressor__n_estimators=200;, score=(train=0.999, test=0.415) total time= 3.2s [CV 4/5; 3/9] START regressor__max_depth=3, regressor__n_estimators=200......... [CV 4/5; 3/9] END regressor__max_depth=3, regressor__n_estimators=200;, score=(train=0.997, test=0.385) total time= 3.2s [CV 5/5; 3/9] START regressor__max_depth=3, regressor__n_estimators=200......... [CV 5/5; 3/9] END regressor__max_depth=3, regressor__n_estimators=200;, score=(train=0.990, test=0.019) total time= 3.2s [CV 1/5; 4/9] START regressor__max_depth=4, regressor__n_estimators=50.......... [CV 1/5; 4/9] END regressor__max_depth=4, regressor__n_estimators=50;, score=(train=0.998, test=0.291) total time= 0.9s [CV 2/5; 4/9] START regressor__max_depth=4, regressor__n_estimators=50.......... [CV 2/5; 4/9] END regressor__max_depth=4, regressor__n_estimators=50;, score=(train=0.996, test=-4.012) total time= 1.1s [CV 3/5; 4/9] START regressor__max_depth=4, regressor__n_estimators=50.......... 
[CV 3/5; 4/9] END regressor__max_depth=4, regressor__n_estimators=50;, score=(train=0.998, test=0.433) total time= 1.0s [CV 4/5; 4/9] START regressor__max_depth=4, regressor__n_estimators=50.......... [CV 4/5; 4/9] END regressor__max_depth=4, regressor__n_estimators=50;, score=(train=0.989, test=0.320) total time= 1.0s [CV 5/5; 4/9] START regressor__max_depth=4, regressor__n_estimators=50.......... [CV 5/5; 4/9] END regressor__max_depth=4, regressor__n_estimators=50;, score=(train=0.971, test=0.016) total time= 0.9s [CV 1/5; 5/9] START regressor__max_depth=4, regressor__n_estimators=100......... [CV 1/5; 5/9] END regressor__max_depth=4, regressor__n_estimators=100;, score=(train=0.999, test=0.292) total time= 1.6s [CV 2/5; 5/9] START regressor__max_depth=4, regressor__n_estimators=100......... [CV 2/5; 5/9] END regressor__max_depth=4, regressor__n_estimators=100;, score=(train=0.999, test=-4.062) total time= 1.7s [CV 3/5; 5/9] START regressor__max_depth=4, regressor__n_estimators=100......... [CV 3/5; 5/9] END regressor__max_depth=4, regressor__n_estimators=100;, score=(train=0.999, test=0.435) total time= 1.6s [CV 4/5; 5/9] START regressor__max_depth=4, regressor__n_estimators=100......... [CV 4/5; 5/9] END regressor__max_depth=4, regressor__n_estimators=100;, score=(train=0.996, test=0.330) total time= 1.7s [CV 5/5; 5/9] START regressor__max_depth=4, regressor__n_estimators=100......... [CV 5/5; 5/9] END regressor__max_depth=4, regressor__n_estimators=100;, score=(train=0.987, test=0.015) total time= 1.7s [CV 1/5; 6/9] START regressor__max_depth=4, regressor__n_estimators=200......... [CV 1/5; 6/9] END regressor__max_depth=4, regressor__n_estimators=200;, score=(train=1.000, test=0.292) total time= 3.2s [CV 2/5; 6/9] START regressor__max_depth=4, regressor__n_estimators=200......... 
[CV 2/5; 6/9] END regressor__max_depth=4, regressor__n_estimators=200;, score=(train=1.000, test=-4.074) total time= 3.3s [CV 3/5; 6/9] START regressor__max_depth=4, regressor__n_estimators=200......... [CV 3/5; 6/9] END regressor__max_depth=4, regressor__n_estimators=200;, score=(train=1.000, test=0.435) total time= 3.0s [CV 4/5; 6/9] START regressor__max_depth=4, regressor__n_estimators=200......... [CV 4/5; 6/9] END regressor__max_depth=4, regressor__n_estimators=200;, score=(train=0.999, test=0.342) total time= 6.3s [CV 5/5; 6/9] START regressor__max_depth=4, regressor__n_estimators=200......... [CV 5/5; 6/9] END regressor__max_depth=4, regressor__n_estimators=200;, score=(train=0.996, test=0.017) total time= 4.4s [CV 1/5; 7/9] START regressor__max_depth=5, regressor__n_estimators=50.......... [CV 1/5; 7/9] END regressor__max_depth=5, regressor__n_estimators=50;, score=(train=0.999, test=0.263) total time= 1.2s [CV 2/5; 7/9] START regressor__max_depth=5, regressor__n_estimators=50.......... [CV 2/5; 7/9] END regressor__max_depth=5, regressor__n_estimators=50;, score=(train=0.998, test=-3.987) total time= 1.2s [CV 3/5; 7/9] START regressor__max_depth=5, regressor__n_estimators=50.......... [CV 3/5; 7/9] END regressor__max_depth=5, regressor__n_estimators=50;, score=(train=0.999, test=0.437) total time= 1.9s [CV 4/5; 7/9] START regressor__max_depth=5, regressor__n_estimators=50.......... [CV 4/5; 7/9] END regressor__max_depth=5, regressor__n_estimators=50;, score=(train=0.994, test=0.346) total time= 4.0s [CV 5/5; 7/9] START regressor__max_depth=5, regressor__n_estimators=50.......... [CV 5/5; 7/9] END regressor__max_depth=5, regressor__n_estimators=50;, score=(train=0.980, test=0.024) total time= 1.4s [CV 1/5; 8/9] START regressor__max_depth=5, regressor__n_estimators=100......... 
[CV 1/5; 8/9] END regressor__max_depth=5, regressor__n_estimators=100;, score=(train=1.000, test=0.265) total time= 2.9s [CV 2/5; 8/9] START regressor__max_depth=5, regressor__n_estimators=100......... [CV 2/5; 8/9] END regressor__max_depth=5, regressor__n_estimators=100;, score=(train=0.999, test=-4.010) total time= 4.1s [CV 3/5; 8/9] START regressor__max_depth=5, regressor__n_estimators=100......... [CV 3/5; 8/9] END regressor__max_depth=5, regressor__n_estimators=100;, score=(train=1.000, test=0.437) total time= 7.9s [CV 4/5; 8/9] START regressor__max_depth=5, regressor__n_estimators=100......... [CV 4/5; 8/9] END regressor__max_depth=5, regressor__n_estimators=100;, score=(train=0.998, test=0.360) total time= 2.8s [CV 5/5; 8/9] START regressor__max_depth=5, regressor__n_estimators=100......... [CV 5/5; 8/9] END regressor__max_depth=5, regressor__n_estimators=100;, score=(train=0.993, test=0.027) total time= 3.6s [CV 1/5; 9/9] START regressor__max_depth=5, regressor__n_estimators=200......... [CV 1/5; 9/9] END regressor__max_depth=5, regressor__n_estimators=200;, score=(train=1.000, test=0.266) total time= 8.7s [CV 2/5; 9/9] START regressor__max_depth=5, regressor__n_estimators=200......... [CV 2/5; 9/9] END regressor__max_depth=5, regressor__n_estimators=200;, score=(train=1.000, test=-4.012) total time= 3.2s [CV 3/5; 9/9] START regressor__max_depth=5, regressor__n_estimators=200......... [CV 3/5; 9/9] END regressor__max_depth=5, regressor__n_estimators=200;, score=(train=1.000, test=0.437) total time= 3.1s [CV 4/5; 9/9] START regressor__max_depth=5, regressor__n_estimators=200......... [CV 4/5; 9/9] END regressor__max_depth=5, regressor__n_estimators=200;, score=(train=0.999, test=0.365) total time= 3.3s [CV 5/5; 9/9] START regressor__max_depth=5, regressor__n_estimators=200......... 
[CV 5/5; 9/9] END regressor__max_depth=5, regressor__n_estimators=200;, score=(train=0.998, test=0.028) total time= 3.5s ---------- iter: 1 n_candidates: 3 n_resources: 2625 Fitting 5 folds for each of 3 candidates, totalling 15 fits [CV 1/5; 1/3] START regressor__max_depth=5, regressor__n_estimators=50.......... [CV 1/5; 1/3] END regressor__max_depth=5, regressor__n_estimators=50;, score=(train=0.982, test=0.108) total time= 5.0s [CV 2/5; 1/3] START regressor__max_depth=5, regressor__n_estimators=50.......... [CV 2/5; 1/3] END regressor__max_depth=5, regressor__n_estimators=50;, score=(train=0.946, test=0.192) total time= 4.9s [CV 3/5; 1/3] START regressor__max_depth=5, regressor__n_estimators=50.......... [CV 3/5; 1/3] END regressor__max_depth=5, regressor__n_estimators=50;, score=(train=0.984, test=0.063) total time= 4.5s [CV 4/5; 1/3] START regressor__max_depth=5, regressor__n_estimators=50.......... [CV 4/5; 1/3] END regressor__max_depth=5, regressor__n_estimators=50;, score=(train=0.997, test=0.509) total time= 4.9s [CV 5/5; 1/3] START regressor__max_depth=5, regressor__n_estimators=50.......... [CV 5/5; 1/3] END regressor__max_depth=5, regressor__n_estimators=50;, score=(train=0.982, test=0.219) total time= 4.9s [CV 1/5; 2/3] START regressor__max_depth=5, regressor__n_estimators=200......... [CV 1/5; 2/3] END regressor__max_depth=5, regressor__n_estimators=200;, score=(train=0.996, test=0.115) total time= 21.1s [CV 2/5; 2/3] START regressor__max_depth=5, regressor__n_estimators=200......... [CV 2/5; 2/3] END regressor__max_depth=5, regressor__n_estimators=200;, score=(train=0.992, test=0.199) total time= 19.6s [CV 3/5; 2/3] START regressor__max_depth=5, regressor__n_estimators=200......... [CV 3/5; 2/3] END regressor__max_depth=5, regressor__n_estimators=200;, score=(train=0.998, test=0.087) total time= 20.2s [CV 4/5; 2/3] START regressor__max_depth=5, regressor__n_estimators=200......... 
[CV 4/5; 2/3] END regressor__max_depth=5, regressor__n_estimators=200;, score=(train=0.999, test=0.505) total time= 17.9s [CV 5/5; 2/3] START regressor__max_depth=5, regressor__n_estimators=200......... [CV 5/5; 2/3] END regressor__max_depth=5, regressor__n_estimators=200;, score=(train=0.997, test=0.159) total time= 17.9s [CV 1/5; 3/3] START regressor__max_depth=3, regressor__n_estimators=50.......... [CV 1/5; 3/3] END regressor__max_depth=3, regressor__n_estimators=50;, score=(train=0.952, test=0.181) total time= 3.4s [CV 2/5; 3/3] START regressor__max_depth=3, regressor__n_estimators=50.......... [CV 2/5; 3/3] END regressor__max_depth=3, regressor__n_estimators=50;, score=(train=0.871, test=0.161) total time= 3.6s [CV 3/5; 3/3] START regressor__max_depth=3, regressor__n_estimators=50.......... [CV 3/5; 3/3] END regressor__max_depth=3, regressor__n_estimators=50;, score=(train=0.956, test=0.057) total time= 3.5s [CV 4/5; 3/3] START regressor__max_depth=3, regressor__n_estimators=50.......... [CV 4/5; 3/3] END regressor__max_depth=3, regressor__n_estimators=50;, score=(train=0.984, test=0.437) total time= 3.1s [CV 5/5; 3/3] START regressor__max_depth=3, regressor__n_estimators=50.......... [CV 5/5; 3/3] END regressor__max_depth=3, regressor__n_estimators=50;, score=(train=0.937, test=0.019) total time= 3.5s ---------- iter: 2 n_candidates: 1 n_resources: 7875 Fitting 5 folds for each of 1 candidates, totalling 5 fits [CV 1/5; 1/1] START regressor__max_depth=5, regressor__n_estimators=50.......... [CV 1/5; 1/1] END regressor__max_depth=5, regressor__n_estimators=50;, score=(train=0.968, test=0.217) total time= 32.0s [CV 2/5; 1/1] START regressor__max_depth=5, regressor__n_estimators=50.......... [CV 2/5; 1/1] END regressor__max_depth=5, regressor__n_estimators=50;, score=(train=0.963, test=-0.367) total time= 32.0s [CV 3/5; 1/1] START regressor__max_depth=5, regressor__n_estimators=50.......... 
[CV 3/5; 1/1] END regressor__max_depth=5, regressor__n_estimators=50;, score=(train=0.952, test=0.170) total time= 33.3s [CV 4/5; 1/1] START regressor__max_depth=5, regressor__n_estimators=50.......... [CV 4/5; 1/1] END regressor__max_depth=5, regressor__n_estimators=50;, score=(train=0.967, test=0.255) total time= 31.8s [CV 5/5; 1/1] START regressor__max_depth=5, regressor__n_estimators=50.......... [CV 5/5; 1/1] END regressor__max_depth=5, regressor__n_estimators=50;, score=(train=0.967, test=-1.139) total time= 33.1s
Pudimos ver que el mejor modelo encontrado mejora los resultados obtenidos en el baseline.
Se pudo resolver el problema bastante bien, superando nuestro propio baseline al optimizar los hiperparámetros del modelo y los atributos que deberíamos usar como input.
El baseline mejoró bastante, pasando de un f1 de 0.295 y un r2 de 0.182 en el dataset de test evaluado en CodaLab, a un f1 de 0.35 y un r2 de 0.18 después de las optimizaciones. La verdad, yo quedé bastante conforme con los resultados; siempre se puede mejorar más aún, pero este fue un resultado satisfactorio.
A mi parecer es un problema complejo pero si se me ocurren algunas maneras de modelar mejor el problema, como tratar de usar el texto libre de las descripciones utilizando NLP, contar el número de juegos de cada publicador y de cada developer y ver si eso ayudaba a la clasificación y la regresión.
Siguiendo un poco los resultados obtenidos en CodaLab, me doy cuenta de que me fue mejor en clasificación que en regresión, y esto lo asocio a mi experiencia, ya que siempre he tenido que hacer clasificaciones y no regresiones y por ende no sabía bien qué modelo utilizar. Utilicé XGBoost porque en internet salía que últimamente había logrado excelentes resultados en data tabular, pero al no conocerlo bien tampoco sé a la perfección cómo hacerle el finetuning.
El proyecto me gustó bastante y la dinámica de la competencia ayuda a evaluar los resultados que uno está teniendo en comparación al resto, lo que sirve bastante como motivación. En general me gustó bastante y aprecio la idea. Lo que me habría gustado es haber aprendido un poco más sobre regresiones en el curso en general, porque casi todo machine learning lo enfocan exclusivamente en clasificación.
Para subir los resultados obtenidos a la página de CodaLab utilicé la función generateFiles entregada más abajo. Esto se debe a que usted deberá generar archivos que respeten estrictamente el formato de CodaLab; de lo contrario, los resultados no se verán reflejados en la página de la competencia.
Los resultados obtenidos por su modelo de clasificación y regresión serán guardados en un archivo zip que contiene los archivos predictions_clf.txt para la clasificación y predictions_rgr.txt para la regresión. Los resultados, como se comentó antes, deberán ser obtenidos en base al dataset test.pickle y en cada una de las líneas deberán presentar las predicciones realizadas.
Ejemplos de archivos:
[ ] predictions_clf.txt
Mostly Positive
Mostly Positive
Negative
Positive
Negative
Positive
...
[ ] predictions_rgr.txt
16103.58
16103.58
16041.89
9328.62
107976.03
194374.08
...
from zipfile import ZipFile
import os
def generateFiles(predict_data, clf_pipe, rgr_pipe):
    """Generate the prediction files to upload to CodaLab.

    Writes one prediction per line to ``predictions_clf.txt`` and
    ``predictions_rgr.txt``, bundles both into ``predictions.zip`` in the
    current working directory, and removes the intermediate text files.

    Input
        predict_data: DataFrame with the examples to predict
        clf_pipe: fitted classification pipeline (exposes .predict)
        rgr_pipe: fitted regression pipeline (exposes .predict)
    Output
        predictions.zip written to the current working directory
    """
    y_pred_clf = clf_pipe.predict(predict_data)
    y_pred_rgr = rgr_pipe.predict(predict_data)
    try:
        with open('./predictions_clf.txt', 'w') as f:
            f.writelines(f"{item}\n" for item in y_pred_clf)
        with open('./predictions_rgr.txt', 'w') as f:
            f.writelines(f"{item}\n" for item in y_pred_rgr)
        with ZipFile('predictions.zip', 'w') as zipObj2:
            zipObj2.write('predictions_rgr.txt')
            zipObj2.write('predictions_clf.txt')
    finally:
        # Clean up the intermediate text files even if zipping fails
        # (the original version leaked them on any error after writing).
        for tmp in ('predictions_rgr.txt', 'predictions_clf.txt'):
            if os.path.exists(tmp):
                os.remove(tmp)
# Build the CodaLab submission zip from the two tuned searches
# (gs_clf = classifier search, gs_reg = regressor search) on the test set.
generateFiles(df_test, gs_clf, gs_reg)